# Set a random seed so the shuffle is reproducible, then randomly reorder
# the rows (the train/test split later in this script is positional, so
# the rows must be in random order first).
# NOTE: sample(nrow(df)) is equivalent to sample(1:nrow(df)) for any
# non-empty data frame, but does not misbehave (1:0 == c(1, 0)) if the
# data frame ever has zero rows.
set.seed(1234)
diseaseInfo <- diseaseInfo[sample(nrow(diseaseInfo)), ]
head(diseaseInfo)
## # A tibble: 6 x 24
##       Id source latitude longitude region country admin1 localityName
##    <dbl> <chr>     <dbl>     <dbl> <chr>  <chr>   <chr>  <chr>       
## 1 219318 Natio…     28.4     46.0  Asia   Saudi … Easte… Hafr-Elbatin
## 2 219097 OIE        45.6     11.4  Europe Italy   Veneto CASTELGOMBE…
## 3 219828 OIE        52.4     23.2  Europe Poland  Podla… Adamowo     
## 4 221042 OIE        36.6     10.7  Africa Tunisia Nabeul Beni khalled
## 5 217753 OIE        46.1      4.43 Europe France  Rhone… POULE LES E…
## 6 228469 OIE        35.3    129.   Asia   Republ… Kyong… Gijang-gun  
## # … with 16 more variables: localityQuality <chr>, observationDate <chr>,
## #   reportingDate <chr>, status <chr>, disease <chr>, serotypes <chr>,
## #   speciesDescription <chr>, sumAtRisk <dbl>, sumCases <dbl>, sumDeaths <dbl>,
## #   sumDestroyed <dbl>, sumSlaughtered <dbl>, humansGenderDesc <chr>,
## #   humansAge <dbl>, humansAffected <dbl>, humansDeaths <dbl>
# Drop every column whose name starts with "human": those columns leak
# the label (whether humans were affected), so the model must not see them.
diseaseInfo_humansRemoved <- select(diseaseInfo, -starts_with("human"))

# Labels ----

# Build the target variable: a one-column logical matrix that is TRUE
# when `humansAffected` has a recorded value (i.e. is not NA) and FALSE
# otherwise.  `!is.na(...)` is the direct equivalent of piping through
# is.na() and then negating with magrittr::not().
diseaseLabels <- !is.na(select(diseaseInfo, humansAffected))

# peek at the labels...
head(diseaseLabels) # of our target variable
##      humansAffected
## [1,]           TRUE
## [2,]          FALSE
## [3,]          FALSE
## [4,]          FALSE
## [5,]          FALSE
## [6,]          FALSE
# ...and at the column they were derived from
head(diseaseInfo$humansAffected) # of the original column
## [1]  1 NA NA NA NA NA
# Keep only numeric predictor columns:
#   - drop Id (an arbitrary case identifier with no signal)
#   - drop longitude/latitude (location is already captured by country)
#   - then keep whatever numeric columns remain
diseaseInfo_numeric <- diseaseInfo_humansRemoved %>%
    select(-Id, -longitude, -latitude) %>%
    select_if(is.numeric)

# confirm that every remaining column is numeric
str(diseaseInfo_numeric)
## Classes 'tbl_df', 'tbl' and 'data.frame':    17008 obs. of  5 variables:
##  $ sumAtRisk     : num  NA 53 NA 61 93 12 103 49 13 NA ...
##  $ sumCases      : num  NA 4 1 1 1 NA 1 9 10 1 ...
##  $ sumDeaths     : num  NA 0 1 0 0 6 NA 0 10 0 ...
##  $ sumDestroyed  : num  NA 0 0 0 0 6 NA 0 3 1 ...
##  $ sumSlaughtered: num  NA 0 0 0 0 NA NA 0 0 0 ...
# peek at the country column we are about to one-hot encode
head(diseaseInfo$country)
## [1] "Saudi Arabia"      "Italy"             "Poland"           
## [4] "Tunisia"           "France"            "Republic of Korea"
# one-hot matrix for just the first few rows of the "country" column
# (`~country-1` removes the intercept so every country gets its own
# 0/1 indicator column instead of one level being the baseline)
model.matrix(~country-1,head(diseaseInfo))
##   countryFrance countryItaly countryPoland countryRepublic of Korea
## 1             0            0             0                        0
## 2             0            1             0                        0
## 3             0            0             1                        0
## 4             0            0             0                        0
## 5             1            0             0                        0
## 6             0            0             0                        1
##   countrySaudi Arabia countryTunisia
## 1                   1              0
## 2                   0              0
## 3                   0              0
## 4                   0              1
## 5                   0              0
## 6                   0              0
## attr(,"assign")
## [1] 1 1 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$country
## [1] "contr.treatment"
# convert categorical factor into one-hot encoding
# NOTE(review): the name `region` is a misnomer — this matrix holds the
# one-hot COUNTRY columns, not the `region` column.  It is referenced
# by that name further down, so it is kept as-is here.
region <- model.matrix(~country-1,diseaseInfo)
# some of the species descriptions we need to parse
head(diseaseInfo$speciesDescription)
## [1] NA                                                 
## [2] "domestic, cattle"                                 
## [3] "wild, wild boar"                                  
## [4] "domestic, cattle, domestic, goat, domestic, sheep"
## [5] "domestic, cattle"                                 
## [6] "domestic, unspecified bird"
# add a boolean column to our numeric dataframe indicating whether a
# species is domestic (NA descriptions yield NA here)
diseaseInfo_numeric$is_domestic <- str_detect(diseaseInfo$speciesDescription, "domestic")
# get the species name from each description by taking its last word.
# BUG FIX: str_replace() only removes the FIRST punctuation character,
# which left trailing parentheses in place and made "[a-z]*$" match the
# empty string on those rows; str_replace_all() removes them all, as the
# original comment intended.
speciesList <- diseaseInfo$speciesDescription %>%
    str_replace_all("[[:punct:]]", "") %>% # remove ALL punctuation (some rows have parentheses)
    str_extract("[a-z]*$") # extract the last word in each row

# convert our list into a dataframe...
speciesList <- tibble(species = speciesList)

# ...and convert to a matrix using one-hot encoding.  model.matrix()
# drops NA rows by default, which would misalign the matrix with the
# labels, so temporarily set na.action = "na.pass" and restore the
# previous global option afterwards.
old_na_action <- options(na.action = "na.pass")
species <- model.matrix(~species-1,speciesList)
options(old_na_action)
# add our one-hot encoded variables and convert the dataframe into a matrix
diseaseInfo_numeric <- cbind(diseaseInfo_numeric, region, species)
diseaseInfo_matrix <- data.matrix(diseaseInfo_numeric)
# Positional 70/30 train/test split (safe only because the rows were
# shuffled at the top of the script).
numberOfTrainingSamples <- round(length(diseaseLabels) * .7)
train_index <- seq_len(numberOfTrainingSamples)

# training split: the first 70% of rows
train_data <- diseaseInfo_matrix[train_index, ]
train_labels <- diseaseLabels[train_index]

# testing split: everything not in the training index
test_data <- diseaseInfo_matrix[-train_index, ]
test_labels <- diseaseLabels[-train_index]
# wrap each split in the DMatrix object that xgboost expects
dtrain <- xgb.DMatrix(data = train_data, label = train_labels)
dtest <- xgb.DMatrix(data = test_data, label = test_labels)

# Training the model ----

# Fit a first, default-ish xgboost model on the training DMatrix.
model <- xgboost(
    data = dtrain,                 # the training data
    nround = 2,                    # max number of boosting iterations
    objective = "binary:logistic"  # logistic loss for binary classification
)
## [1]  train-error:0.013943 
## [2]  train-error:0.013943
# score the held-out testing data
pred <- predict(model, dtest)

# classification error: fraction of thresholded (> 0.5) predictions
# that disagree with the true labels
err <- mean(as.numeric(pred > 0.5) != test_labels)
print(paste("test-error=", err))
## [1] "test-error= 0.0139161113288906"

# Tuning the model ----

# Retrain with a shallower tree depth to reduce overfitting risk.
model_tuned <- xgboost(
    data = dtrain,                 # the training data
    max.depth = 3,                 # maximum depth of each decision tree
    nround = 2,                    # max number of boosting iterations
    objective = "binary:logistic"  # logistic loss for binary classification
)
## [1]  train-error:0.013943 
## [2]  train-error:0.013943
# score the held-out testing data
pred <- predict(model_tuned, dtest)

# classification error on the test set
err <- mean(as.numeric(pred > 0.5) != test_labels)
print(paste("test-error=", err))
## [1] "test-error= 0.0139161113288906"
# Count each class in the training labels so the rare positive class
# can be up-weighted during training.
negative_cases <- sum(train_labels == FALSE)
# NOTE(review): the misspelled name "postive_cases" is kept because
# later code in this file refers to it by that exact name.
postive_cases <- sum(train_labels == TRUE)

# Retrain with class weighting and early stopping.
model_tuned <- xgboost(
    data = dtrain,                  # the training data
    max.depth = 3,                  # maximum depth of each decision tree
    nround = 10,                    # number of boosting rounds
    early_stopping_rounds = 3,      # stop after 3 rounds without improvement
    objective = "binary:logistic",  # logistic loss for binary classification
    scale_pos_weight = negative_cases / postive_cases  # compensate for class imbalance
)
## [1]  train-error:0.014195 
## Will train until train_error hasn't improved in 3 rounds.
## 
## [2]  train-error:0.014195 
## [3]  train-error:0.013859 
## [4]  train-error:0.013859 
## [5]  train-error:0.013859 
## [6]  train-error:0.013859 
## Stopping. Best iteration:
## [3]  train-error:0.013859
# score the held-out testing data
pred <- predict(model_tuned, dtest)

# classification error on the test set
err <- mean(as.numeric(pred > 0.5) != test_labels)
print(paste("test-error=", err))
## [1] "test-error= 0.0139161113288906"
# Retrain once more, adding gamma: a minimum loss reduction required to
# make a further split, i.e. a regularization term.
model_tuned <- xgboost(
    data = dtrain,                  # the training data
    max.depth = 3,                  # maximum depth of each decision tree
    nround = 10,                    # number of boosting rounds
    early_stopping_rounds = 3,      # stop after 3 rounds without improvement
    objective = "binary:logistic",  # logistic loss for binary classification
    scale_pos_weight = negative_cases/postive_cases,  # compensate for class imbalance
    gamma = 1                       # regularization term
)
## [1]  train-error:0.014195 
## Will train until train_error hasn't improved in 3 rounds.
## 
## [2]  train-error:0.014195 
## [3]  train-error:0.013859 
## [4]  train-error:0.013859 
## [5]  train-error:0.013859 
## [6]  train-error:0.013859 
## Stopping. Best iteration:
## [3]  train-error:0.013859
# score the held-out testing data
pred <- predict(model_tuned, dtest)

# classification error on the test set
err <- mean(as.numeric(pred > 0.5) != test_labels)
print(paste("test-error=", err))
## [1] "test-error= 0.0139161113288906"

# Examining the model ----

# plot them features! what's contributing most to our model?
# BUG FIX: diseaseInfo_matrix is a matrix, so names() returns NULL;
# colnames() is needed to pass the actual feature names.
xgb.plot.multi.trees(feature_names = colnames(diseaseInfo_matrix), model = model)
## Column 2 ['No'] of item 2 is missing in item 1. Use fill=TRUE to fill with NA (NULL for list columns), or use.names=FALSE to ignore column names. use.names='check' (default from v1.12.2) emits this message and proceeds as if use.names=FALSE for  backwards compatibility. See news item 5 in v1.12.2 for options to control this message.
# Convert log-odds to a probability.
#
# Uses stats::plogis(), the logistic CDF, which equals
# exp(odds) / (1 + exp(odds)) but is numerically stable: the naive
# form overflows to NaN (Inf/Inf) for large positive `odds`.
#
# @param odds numeric vector of log-odds
# @return numeric vector of probabilities in [0, 1], same length as `odds`
odds_to_probs <- function(odds){
    plogis(odds)
}


# probability of leaf above countryPortugal
odds_to_probs(-0.599)
## [1] 0.3545725
# get information on how important each feature is.
# BUG FIX: diseaseInfo_matrix is a matrix, so names() returns NULL;
# colnames() supplies the real feature names to xgb.importance().
importance_matrix <- xgb.importance(colnames(diseaseInfo_matrix), model = model)

# and plot it!
xgb.plot.importance(importance_matrix)